IMPORTING LIBRARIES

In [870]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline 
import seaborn as sns
from sklearn.preprocessing import Imputer
from sklearn.model_selection import train_test_split
import warnings
In [871]:
warnings.filterwarnings('ignore')

LOADING THE DATASET

In [872]:
vehicle = pd.read_csv("C:/Users/HP/Downloads/vehicle-1.csv")
vehicle
Out[872]:
compactness circularity distance_circularity radius_ratio pr.axis_aspect_ratio max.length_aspect_ratio scatter_ratio elongatedness pr.axis_rectangularity max.length_rectangularity scaled_variance scaled_variance.1 scaled_radius_of_gyration scaled_radius_of_gyration.1 skewness_about skewness_about.1 skewness_about.2 hollows_ratio class
0 95 48.0 83.0 178.0 72.0 10 162.0 42.0 20.0 159 176.0 379.0 184.0 70.0 6.0 16.0 187.0 197 van
1 91 41.0 84.0 141.0 57.0 9 149.0 45.0 19.0 143 170.0 330.0 158.0 72.0 9.0 14.0 189.0 199 van
2 104 50.0 106.0 209.0 66.0 10 207.0 32.0 23.0 158 223.0 635.0 220.0 73.0 14.0 9.0 188.0 196 car
3 93 41.0 82.0 159.0 63.0 9 144.0 46.0 19.0 143 160.0 309.0 127.0 63.0 6.0 10.0 199.0 207 van
4 85 44.0 70.0 205.0 103.0 52 149.0 45.0 19.0 144 241.0 325.0 188.0 127.0 9.0 11.0 180.0 183 bus
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
841 93 39.0 87.0 183.0 64.0 8 169.0 40.0 20.0 134 200.0 422.0 149.0 72.0 7.0 25.0 188.0 195 car
842 89 46.0 84.0 163.0 66.0 11 159.0 43.0 20.0 159 173.0 368.0 176.0 72.0 1.0 20.0 186.0 197 van
843 106 54.0 101.0 222.0 67.0 12 222.0 30.0 25.0 173 228.0 721.0 200.0 70.0 3.0 4.0 187.0 201 car
844 86 36.0 78.0 146.0 58.0 7 135.0 50.0 18.0 124 155.0 270.0 148.0 66.0 0.0 25.0 190.0 195 car
845 85 36.0 66.0 123.0 55.0 5 120.0 56.0 17.0 128 140.0 212.0 131.0 73.0 1.0 18.0 186.0 190 van

846 rows × 19 columns

In [873]:
#column description
vehicle.columns
Out[873]:
Index(['compactness', 'circularity', 'distance_circularity', 'radius_ratio',
       'pr.axis_aspect_ratio', 'max.length_aspect_ratio', 'scatter_ratio',
       'elongatedness', 'pr.axis_rectangularity', 'max.length_rectangularity',
       'scaled_variance', 'scaled_variance.1', 'scaled_radius_of_gyration',
       'scaled_radius_of_gyration.1', 'skewness_about', 'skewness_about.1',
       'skewness_about.2', 'hollows_ratio', 'class'],
      dtype='object')
In [874]:
vehicle.head(10)
Out[874]:
compactness circularity distance_circularity radius_ratio pr.axis_aspect_ratio max.length_aspect_ratio scatter_ratio elongatedness pr.axis_rectangularity max.length_rectangularity scaled_variance scaled_variance.1 scaled_radius_of_gyration scaled_radius_of_gyration.1 skewness_about skewness_about.1 skewness_about.2 hollows_ratio class
0 95 48.0 83.0 178.0 72.0 10 162.0 42.0 20.0 159 176.0 379.0 184.0 70.0 6.0 16.0 187.0 197 van
1 91 41.0 84.0 141.0 57.0 9 149.0 45.0 19.0 143 170.0 330.0 158.0 72.0 9.0 14.0 189.0 199 van
2 104 50.0 106.0 209.0 66.0 10 207.0 32.0 23.0 158 223.0 635.0 220.0 73.0 14.0 9.0 188.0 196 car
3 93 41.0 82.0 159.0 63.0 9 144.0 46.0 19.0 143 160.0 309.0 127.0 63.0 6.0 10.0 199.0 207 van
4 85 44.0 70.0 205.0 103.0 52 149.0 45.0 19.0 144 241.0 325.0 188.0 127.0 9.0 11.0 180.0 183 bus
5 107 NaN 106.0 172.0 50.0 6 255.0 26.0 28.0 169 280.0 957.0 264.0 85.0 5.0 9.0 181.0 183 bus
6 97 43.0 73.0 173.0 65.0 6 153.0 42.0 19.0 143 176.0 361.0 172.0 66.0 13.0 1.0 200.0 204 bus
7 90 43.0 66.0 157.0 65.0 9 137.0 48.0 18.0 146 162.0 281.0 164.0 67.0 3.0 3.0 193.0 202 van
8 86 34.0 62.0 140.0 61.0 7 122.0 54.0 17.0 127 141.0 223.0 112.0 64.0 2.0 14.0 200.0 208 van
9 93 44.0 98.0 NaN 62.0 11 183.0 36.0 22.0 146 202.0 505.0 152.0 64.0 4.0 14.0 195.0 204 car
In [875]:
vehicle.shape
Out[875]:
(846, 19)

There are 19 columns and 846 rows in the vehicleset

ATTRIBUTE DATATYPES

In [876]:
vehicle.dtypes
Out[876]:
compactness                      int64
circularity                    float64
distance_circularity           float64
radius_ratio                   float64
pr.axis_aspect_ratio           float64
max.length_aspect_ratio          int64
scatter_ratio                  float64
elongatedness                  float64
pr.axis_rectangularity         float64
max.length_rectangularity        int64
scaled_variance                float64
scaled_variance.1              float64
scaled_radius_of_gyration      float64
scaled_radius_of_gyration.1    float64
skewness_about                 float64
skewness_about.1               float64
skewness_about.2               float64
hollows_ratio                    int64
class                           object
dtype: object
In [877]:
#5 number summary
vehicle.describe().transpose()
Out[877]:
count mean std min 25% 50% 75% max
compactness 846.0 93.678487 8.234474 73.0 87.00 93.0 100.0 119.0
circularity 841.0 44.828775 6.152172 33.0 40.00 44.0 49.0 59.0
distance_circularity 842.0 82.110451 15.778292 40.0 70.00 80.0 98.0 112.0
radius_ratio 840.0 168.888095 33.520198 104.0 141.00 167.0 195.0 333.0
pr.axis_aspect_ratio 844.0 61.678910 7.891463 47.0 57.00 61.0 65.0 138.0
max.length_aspect_ratio 846.0 8.567376 4.601217 2.0 7.00 8.0 10.0 55.0
scatter_ratio 845.0 168.901775 33.214848 112.0 147.00 157.0 198.0 265.0
elongatedness 845.0 40.933728 7.816186 26.0 33.00 43.0 46.0 61.0
pr.axis_rectangularity 843.0 20.582444 2.592933 17.0 19.00 20.0 23.0 29.0
max.length_rectangularity 846.0 147.998818 14.515652 118.0 137.00 146.0 159.0 188.0
scaled_variance 843.0 188.631079 31.411004 130.0 167.00 179.0 217.0 320.0
scaled_variance.1 844.0 439.494076 176.666903 184.0 318.00 363.5 587.0 1018.0
scaled_radius_of_gyration 844.0 174.709716 32.584808 109.0 149.00 173.5 198.0 268.0
scaled_radius_of_gyration.1 842.0 72.447743 7.486190 59.0 67.00 71.5 75.0 135.0
skewness_about 840.0 6.364286 4.920649 0.0 2.00 6.0 9.0 22.0
skewness_about.1 845.0 12.602367 8.936081 0.0 5.00 11.0 19.0 41.0
skewness_about.2 845.0 188.919527 6.155809 176.0 184.00 188.0 193.0 206.0
hollows_ratio 846.0 195.632388 7.438797 181.0 190.25 197.0 201.0 211.0

CHECKING THE PRESENCE OF MISSING VALUES

In [878]:
vehicle.isnull().sum()
Out[878]:
compactness                    0
circularity                    5
distance_circularity           4
radius_ratio                   6
pr.axis_aspect_ratio           2
max.length_aspect_ratio        0
scatter_ratio                  1
elongatedness                  1
pr.axis_rectangularity         3
max.length_rectangularity      0
scaled_variance                3
scaled_variance.1              2
scaled_radius_of_gyration      2
scaled_radius_of_gyration.1    4
skewness_about                 6
skewness_about.1               1
skewness_about.2               1
hollows_ratio                  0
class                          0
dtype: int64

There are missing values present in the vehicleset

So we replace it with the median of that particular column

In [879]:
vehicle['circularity'].fillna(vehicle['circularity'].median(),inplace=True)
vehicle['distance_circularity'].fillna(vehicle['distance_circularity'].median(),inplace=True)
vehicle['radius_ratio'].fillna(vehicle['radius_ratio'].median(),inplace=True)
vehicle['pr.axis_aspect_ratio'].fillna(vehicle['pr.axis_aspect_ratio'].median(),inplace=True)
vehicle['scatter_ratio'].fillna(vehicle['scatter_ratio'].median(),inplace=True)
vehicle['elongatedness'].fillna(vehicle['elongatedness'].median(),inplace=True)
vehicle['pr.axis_rectangularity'].fillna(vehicle['pr.axis_rectangularity'].median(),inplace=True)
vehicle['skewness_about.2'].fillna(vehicle['skewness_about.2'].median(),inplace=True)
vehicle['skewness_about.1'].fillna(vehicle['skewness_about.1'].median(),inplace=True)
vehicle['skewness_about'].fillna(vehicle['skewness_about'].median(),inplace=True)
vehicle['scaled_radius_of_gyration.1'].fillna(vehicle['scaled_radius_of_gyration.1'].median(),inplace=True)
vehicle['scaled_radius_of_gyration'].fillna(vehicle['scaled_radius_of_gyration'].median(),inplace=True)
vehicle['scaled_variance'].fillna(vehicle['scaled_variance'].median(),inplace=True)
vehicle['scaled_variance.1'].fillna(vehicle['scaled_variance.1'].median(),inplace=True)
In [880]:
vehicle.isnull().sum()
Out[880]:
compactness                    0
circularity                    0
distance_circularity           0
radius_ratio                   0
pr.axis_aspect_ratio           0
max.length_aspect_ratio        0
scatter_ratio                  0
elongatedness                  0
pr.axis_rectangularity         0
max.length_rectangularity      0
scaled_variance                0
scaled_variance.1              0
scaled_radius_of_gyration      0
scaled_radius_of_gyration.1    0
skewness_about                 0
skewness_about.1               0
skewness_about.2               0
hollows_ratio                  0
class                          0
dtype: int64

There are no missing values present now.

CHECKING THE PRESENCE OF THE OUTLIERS IN THE DATASET.

In [881]:
sns.boxplot(vehicle['compactness'])
Out[881]:
<matplotlib.axes._subplots.AxesSubplot at 0x19b133ca0c8>
In [882]:
sns.boxplot(vehicle['circularity'])
Out[882]:
<matplotlib.axes._subplots.AxesSubplot at 0x19b133d1648>
In [883]:
sns.boxplot(vehicle['distance_circularity'])
Out[883]:
<matplotlib.axes._subplots.AxesSubplot at 0x19b128b2bc8>
In [884]:
sns.boxplot(vehicle['radius_ratio'])
Out[884]:
<matplotlib.axes._subplots.AxesSubplot at 0x19b128e1c08>
In [885]:
sns.boxplot(vehicle['pr.axis_aspect_ratio'])
Out[885]:
<matplotlib.axes._subplots.AxesSubplot at 0x19b12135288>
In [886]:
sns.boxplot(vehicle['max.length_aspect_ratio'])
Out[886]:
<matplotlib.axes._subplots.AxesSubplot at 0x19b1218ffc8>
In [887]:
sns.boxplot(vehicle['scatter_ratio'])
Out[887]:
<matplotlib.axes._subplots.AxesSubplot at 0x19b121ef488>
In [888]:
sns.boxplot(vehicle['elongatedness'])
Out[888]:
<matplotlib.axes._subplots.AxesSubplot at 0x19b1224eec8>
In [889]:
sns.boxplot(vehicle['pr.axis_rectangularity'])
Out[889]:
<matplotlib.axes._subplots.AxesSubplot at 0x19b122bce88>
In [890]:
sns.boxplot(vehicle['max.length_rectangularity'])
Out[890]:
<matplotlib.axes._subplots.AxesSubplot at 0x19b123021c8>
In [891]:
sns.boxplot(vehicle['scaled_variance'])
Out[891]:
<matplotlib.axes._subplots.AxesSubplot at 0x19b12396ac8>
In [892]:
sns.boxplot(vehicle['scaled_variance.1'])
Out[892]:
<matplotlib.axes._subplots.AxesSubplot at 0x19b123e3a88>
In [893]:
sns.boxplot(vehicle['scaled_radius_of_gyration'])
Out[893]:
<matplotlib.axes._subplots.AxesSubplot at 0x19b1244aa88>
In [894]:
sns.boxplot(vehicle['scaled_radius_of_gyration.1'])
Out[894]:
<matplotlib.axes._subplots.AxesSubplot at 0x19b124a3148>
In [895]:
sns.boxplot(vehicle['skewness_about'])
Out[895]:
<matplotlib.axes._subplots.AxesSubplot at 0x19b12539d48>
In [896]:
sns.boxplot(vehicle['skewness_about.1'])
Out[896]:
<matplotlib.axes._subplots.AxesSubplot at 0x19b12592608>
In [897]:
sns.boxplot(vehicle['skewness_about.2'])
Out[897]:
<matplotlib.axes._subplots.AxesSubplot at 0x19b125f4d88>
In [898]:
sns.boxplot(vehicle['hollows_ratio'])
Out[898]:
<matplotlib.axes._subplots.AxesSubplot at 0x19b125b7e48>

Very few columns that are 'skewness_about','radius_ratio','pr.axis_aspect_ratio','max.length_aspect_ratio' and 'scaled_radius_of_gyration.1' have outliers '

Replacing the outliers with median.

In [901]:
import random as r
d = [r.random()*1000 for i in range(0,100)]
df = pd.DataFrame({'skewness_about': d})

median = df['skewness_about'].median()
std = df['skewness_about'].std()
outliers = (df['skewness_about'] - median).abs() > std
df[outliers] = np.nan
df['skewness_about'].fillna(median, inplace=True)
In [902]:
import random as r
d = [r.random()*1000 for i in range(0,100)]
df = pd.DataFrame({'radius_ratio': d})

median = df['radius_ratio'].median()
std = df['radius_ratio'].std()
outliers = (df['radius_ratio'] - median).abs() > std
df[outliers] = np.nan
df['radius_ratio'].fillna(median, inplace=True)
In [903]:
import random as r
d = [r.random()*1000 for i in range(0,100)]
df = pd.DataFrame({'pr.axis_aspect_ratio': d})

median = df['pr.axis_aspect_ratio'].median()
std = df['pr.axis_aspect_ratio'].std()
outliers = (df['pr.axis_aspect_ratio'] - median).abs() > std
df[outliers] = np.nan
df['pr.axis_aspect_ratio'].fillna(median, inplace=True)
In [904]:
import random as r
d = [r.random()*1000 for i in range(0,100)]
df = pd.DataFrame({'max.length_aspect_ratio': d})

median = df['max.length_aspect_ratio'].median()
std = df['max.length_aspect_ratio'].std()
outliers = (df['max.length_aspect_ratio'] - median).abs() > std
df[outliers] = np.nan
df['max.length_aspect_ratio'].fillna(median, inplace=True)
In [905]:
import random as r
d = [r.random()*1000 for i in range(0,100)]
df = pd.DataFrame({'scaled_radius_of_gyration.1': d})

median = df['scaled_radius_of_gyration.1'].median()
std = df['scaled_radius_of_gyration.1'].std()
outliers = (df['scaled_radius_of_gyration.1'] - median).abs() > std
df[outliers] = np.nan
df['scaled_radius_of_gyration.1'].fillna(median, inplace=True)

Checking the distribution of the dependent variable: class column

In [906]:
vehicle["class"].value_counts(normalize=True) 
Out[906]:
car    0.507092
bus    0.257683
van    0.235225
Name: class, dtype: float64

Visualizing the distribution of target column

In [907]:
pd.value_counts(vehicle["class"]).plot(kind="bar")
Out[907]:
<matplotlib.axes._subplots.AxesSubplot at 0x19b126f1cc8>

we can clearly see that the number of cars are relatively higher in number(around 51%).The rest 50 is equally divided into the category bus and van(26 and 23 respecively)

In [908]:
vehicle.groupby(["class"]).count() 
Out[908]:
compactness circularity distance_circularity radius_ratio pr.axis_aspect_ratio max.length_aspect_ratio scatter_ratio elongatedness pr.axis_rectangularity max.length_rectangularity scaled_variance scaled_variance.1 scaled_radius_of_gyration scaled_radius_of_gyration.1 skewness_about skewness_about.1 skewness_about.2 hollows_ratio
class
bus 218 218 218 218 218 218 218 218 218 218 218 218 218 218 218 218 218 218
car 429 429 429 429 429 429 429 429 429 429 429 429 429 429 429 429 429 429
van 199 199 199 199 199 199 199 199 199 199 199 199 199 199 199 199 199 199

PAIRPLOT VISULATION TO SEE THE RELATIONSHIP BETWEEN VARIOUS ATTRIBUTES

In [909]:
# independant variables
X = vehicle.drop(['class'], axis=1)
# the dependent variable
y = vehicle[['class']]

sns.pairplot(X, diag_kind='kde')   
Out[909]:
<seaborn.axisgrid.PairGrid at 0x19b12737ec8>

Some attributes have a strong postive relationship with each other as we can see in the graph.#In addition there are few variables which generate a very distributed scatter plot which means there is no significant relationship at all.

HEATMAP TO SEE THE CORRELATION BETWEEN ATTRIBUTES.

In [910]:
plt.figure(figsize=(15,15))
sns.heatmap(X.corr(),annot=True)
Out[910]:
<matplotlib.axes._subplots.AxesSubplot at 0x19b1f52a108>

The last five columns do not seem to have a strong positive correlation between them though hollows ratio and skweness about 2 have correrelation of 0.89.

The other attributes have good correrelation with each other as we can see in the heat map

STANDARDISING THE DATA USING ZSCORE

In [911]:
df = vehicle.drop('class', axis=1)
In [912]:
from scipy.stats import zscore
z= df.apply(zscore)
In [913]:
z.head()
Out[913]:
compactness circularity distance_circularity radius_ratio pr.axis_aspect_ratio max.length_aspect_ratio scatter_ratio elongatedness pr.axis_rectangularity max.length_rectangularity scaled_variance scaled_variance.1 scaled_radius_of_gyration scaled_radius_of_gyration.1 skewness_about skewness_about.1 skewness_about.2 hollows_ratio
0 0.160580 0.518073 0.057177 0.273363 1.310398 0.311542 -0.207598 0.136262 -0.224342 0.758332 -0.401920 -0.341934 0.285705 -0.327326 -0.073812 0.380870 -0.312012 0.183957
1 -0.325470 -0.623732 0.120741 -0.835032 -0.593753 0.094079 -0.599423 0.520519 -0.610886 -0.344578 -0.593357 -0.619724 -0.513630 -0.059384 0.538390 0.156798 0.013265 0.452977
2 1.254193 0.844303 1.519141 1.202018 0.548738 0.311542 1.148719 -1.144597 0.935290 0.689401 1.097671 1.109379 1.392477 0.074587 1.558727 -0.403383 -0.149374 0.049447
3 -0.082445 -0.623732 -0.006386 -0.295813 0.167907 0.094079 -0.750125 0.648605 -0.610886 -0.344578 -0.912419 -0.738777 -1.466683 -1.265121 -0.073812 -0.291347 1.639649 1.529056
4 -1.054545 -0.134387 -0.769150 1.082192 5.245643 9.444962 -0.599423 0.520519 -0.610886 -0.275646 1.671982 -0.648070 0.408680 7.309005 0.538390 -0.179311 -1.450481 -1.699181

SPLITTING THE DATA INTO TRAIN AND TEST IN THE RATIO OF 70 AND 30.

In [952]:
from sklearn.model_selection import train_test_split

X,y = np.array(vehicle)[ :, 0:18], np.array(vehicle['class'])[:]
x_train, x_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=1)
In [915]:
#Checking the split of vehicle

print("{0:0.2f}% vehicle is in training set".format((len(x_train)/len(vehicle.index)) * 100))
print("{0:0.2f}% vehicle is in test set".format((len(x_test)/len(vehicle.index)) * 100))
69.98% vehicle is in training set
30.02% vehicle is in test set

IMPLEMENTATING THE SUPPORT VECTOR MACHINE

In [925]:
from sklearn.preprocessing import LabelEncoder
le = LabelEncoder()
vehicle['class'] = le.fit_transform(vehicle['class'])
In [926]:
from sklearn import svm
clr = svm.SVC()
clr.fit(x_train , y_train)
Out[926]:
SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
    decision_function_shape='ovr', degree=3, gamma='auto_deprecated',
    kernel='rbf', max_iter=-1, probability=False, random_state=None,
    shrinking=True, tol=0.001, verbose=False)
In [927]:
def getAccuracy(testSet, predictions):
	correct = 0
	for x in range(len(testSet)):
		if testSet[x]== predictions[x]:
			correct += 1
	return (correct/float(len(testSet))) * 100.0
In [928]:
y_pred = clr.predict(x_test)
In [929]:
getAccuracy(y_test, y_pred)
Out[929]:
52.75590551181102

IMPLEMENTATING THE KFOLD CROSS VALIDATION

In [951]:
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score
from sklearn.tree import DecisionTreeRegressor

array = vehicle.values
X2 = array[:,0:18]
Y2 = array[:,18]

X_train, X_test, Y_train, Y_test = train_test_split(X2, Y2, test_size=0.50, random_state=1)

num_folds = 10
seed = 10

kfold = KFold(n_splits=num_folds, random_state=seed)
model = DecisionTreeRegressor()
results = cross_val_score(model, X2, Y2, cv=kfold)
print(results)
print("Accuracy: %.3f%% (%.3f%%)" % (results.mean()*100.0, results.std()*100.0))
[0.79586936 0.88815789 0.78459199 0.85184263 0.73152243 0.5998655
 0.5998639  0.67557129 0.86348862 0.66656489]
Accuracy: 74.573% (10.168%)

IMPLEMENTING THE PCA MODULE.

In [931]:
cov = np.cov(z,rowvar=False)
print(cov)
[[ 1.00118343  0.68569786  0.79086299  0.69055952  0.09164265  0.14842463
   0.81358214 -0.78968322  0.81465658  0.67694334  0.76297234  0.81497566
   0.58593517 -0.24988794  0.23635777  0.15720044  0.29889034  0.36598446]
 [ 0.68569786  1.00118343  0.79325751  0.6216467   0.15396023  0.25176438
   0.8489411  -0.82244387  0.84439802  0.96245572  0.79724837  0.83693508
   0.92691166  0.05200785  0.14436828 -0.01145212 -0.10455005  0.04640562]
 [ 0.79086299  0.79325751  1.00118343  0.76794246  0.15864319  0.26499957
   0.90614687 -0.9123854   0.89408198  0.77544391  0.86253904  0.88706577
   0.70660663 -0.22621115  0.1140589   0.26586088  0.14627113  0.33312625]
 [ 0.69055952  0.6216467   0.76794246  1.00118343  0.66423242  0.45058426
   0.73529816 -0.79041561  0.70922371  0.56962256  0.79435372  0.71928618
   0.53700678 -0.18061084  0.04877032  0.17394649  0.38266622  0.47186659]
 [ 0.09164265  0.15396023  0.15864319  0.66423242  1.00118343  0.64949139
   0.10385472 -0.18325156  0.07969786  0.1270594   0.27323306  0.08929427
   0.12211524  0.15313091 -0.05843967 -0.0320139   0.24016968  0.26804208]
 [ 0.14842463  0.25176438  0.26499957  0.45058426  0.64949139  1.00118343
   0.16638787 -0.18035326  0.16169312  0.30630475  0.31933428  0.1434227
   0.18996732  0.29608463  0.01561769  0.04347324 -0.02611148  0.14408905]
 [ 0.81358214  0.8489411   0.90614687  0.73529816  0.10385472  0.16638787
   1.00118343 -0.97275069  0.99092181  0.81004084  0.94978498  0.9941867
   0.80082111 -0.02757446  0.07454578  0.21267959  0.00563439  0.1189581 ]
 [-0.78968322 -0.82244387 -0.9123854  -0.79041561 -0.18325156 -0.18035326
  -0.97275069  1.00118343 -0.95011894 -0.77677186 -0.93748998 -0.95494487
  -0.76722075  0.10342428 -0.05266193 -0.18527244 -0.11526213 -0.2171615 ]
 [ 0.81465658  0.84439802  0.89408198  0.70922371  0.07969786  0.16169312
   0.99092181 -0.95011894  1.00118343  0.81189327  0.93533261  0.98938264
   0.79763248 -0.01551372  0.08386628  0.21495454 -0.01867064  0.09940372]
 [ 0.67694334  0.96245572  0.77544391  0.56962256  0.1270594   0.30630475
   0.81004084 -0.77677186  0.81189327  1.00118343  0.74586628  0.79555492
   0.86747579  0.04167099  0.13601231  0.00136727 -0.10407076  0.07686047]
 [ 0.76297234  0.79724837  0.86253904  0.79435372  0.27323306  0.31933428
   0.94978498 -0.93748998  0.93533261  0.74586628  1.00118343  0.94679667
   0.77983844  0.11321163  0.03677248  0.19446837  0.01423606  0.08579656]
 [ 0.81497566  0.83693508  0.88706577  0.71928618  0.08929427  0.1434227
   0.9941867  -0.95494487  0.98938264  0.79555492  0.94679667  1.00118343
   0.79595778 -0.01541878  0.07696823  0.20104818  0.00622636  0.10305714]
 [ 0.58593517  0.92691166  0.70660663  0.53700678  0.12211524  0.18996732
   0.80082111 -0.76722075  0.79763248  0.86747579  0.77983844  0.79595778
   1.00118343  0.19169941  0.16667971 -0.05621953 -0.22471583 -0.11814142]
 [-0.24988794  0.05200785 -0.22621115 -0.18061084  0.15313091  0.29608463
  -0.02757446  0.10342428 -0.01551372  0.04167099  0.11321163 -0.01541878
   0.19169941  1.00118343 -0.08846001 -0.12633227 -0.749751   -0.80307227]
 [ 0.23635777  0.14436828  0.1140589   0.04877032 -0.05843967  0.01561769
   0.07454578 -0.05266193  0.08386628  0.13601231  0.03677248  0.07696823
   0.16667971 -0.08846001  1.00118343 -0.03503155  0.1154338   0.09724079]
 [ 0.15720044 -0.01145212  0.26586088  0.17394649 -0.0320139   0.04347324
   0.21267959 -0.18527244  0.21495454  0.00136727  0.19446837  0.20104818
  -0.05621953 -0.12633227 -0.03503155  1.00118343  0.07740174  0.20523257]
 [ 0.29889034 -0.10455005  0.14627113  0.38266622  0.24016968 -0.02611148
   0.00563439 -0.11526213 -0.01867064 -0.10407076  0.01423606  0.00622636
  -0.22471583 -0.749751    0.1154338   0.07740174  1.00118343  0.89363767]
 [ 0.36598446  0.04640562  0.33312625  0.47186659  0.26804208  0.14408905
   0.1189581  -0.2171615   0.09940372  0.07686047  0.08579656  0.10305714
  -0.11814142 -0.80307227  0.09724079  0.20523257  0.89363767  1.00118343]]
In [932]:
from sklearn.decomposition import PCA
pca = PCA(n_components=18)
pca.fit(z)
Out[932]:
PCA(copy=True, iterated_power='auto', n_components=18, random_state=None,
    svd_solver='auto', tol=0.0, whiten=False)
In [933]:
print(pca.explained_variance_)
[9.40460261e+00 3.01492206e+00 1.90352502e+00 1.17993747e+00
 9.17260633e-01 5.39992629e-01 3.58870118e-01 2.21932456e-01
 1.60608597e-01 9.18572234e-02 6.64994118e-02 4.66005994e-02
 3.57947189e-02 2.74120657e-02 2.05792871e-02 1.79166314e-02
 1.00257898e-02 2.96445743e-03]
In [934]:
print(pca.components_)
[[ 2.75283688e-01  2.93258469e-01  3.04609128e-01  2.67606877e-01
   8.05039890e-02  9.72756855e-02  3.17092750e-01 -3.14133155e-01
   3.13959064e-01  2.82830900e-01  3.09280359e-01  3.13788457e-01
   2.72047492e-01 -2.08137692e-02  4.14555082e-02  5.82250207e-02
   3.02795063e-02  7.41453913e-02]
 [-1.26953763e-01  1.25576727e-01 -7.29516436e-02 -1.89634378e-01
  -1.22174860e-01  1.07482875e-02  4.81181371e-02  1.27498515e-02
   5.99352482e-02  1.16220532e-01  6.22806229e-02  5.37843596e-02
   2.09233172e-01  4.88525148e-01 -5.50899716e-02 -1.24085090e-01
  -5.40914775e-01 -5.40354258e-01]
 [-1.19922479e-01 -2.48205467e-02 -5.60143254e-02  2.75074211e-01
   6.42012966e-01  5.91801304e-01 -9.76283108e-02  5.76484384e-02
  -1.09512416e-01 -1.70641987e-02  5.63239801e-02 -1.08840729e-01
  -3.14636493e-02  2.86277015e-01 -1.15679354e-01 -7.52828901e-02
   8.73592034e-03  3.95242743e-02]
 [ 7.83843562e-02  1.87337408e-01 -7.12008427e-02 -4.26053415e-02
   3.27257119e-02  3.14147277e-02 -9.57485748e-02  8.22901952e-02
  -9.24582989e-02  1.88005612e-01 -1.19844008e-01 -9.17449325e-02
   2.00095228e-01 -6.55051354e-02  6.04794251e-01 -6.66114117e-01
   1.05526253e-01  4.74890311e-02]
 [ 6.95178336e-02 -8.50649539e-02  4.06645651e-02 -4.61473714e-02
  -4.05494487e-02  2.13432566e-01 -1.54853055e-02  7.68518712e-02
   2.17633157e-03 -6.06366845e-02 -4.56472367e-04 -1.95548315e-02
  -6.15991681e-02  1.45530146e-01  7.29189842e-01  5.99196401e-01
  -1.00602332e-01 -2.98614819e-02]
 [ 1.44875476e-01 -3.02731148e-01 -1.38405773e-01  2.48136636e-01
   2.36932611e-01 -4.19330747e-01  1.16100153e-01 -1.41840112e-01
   9.80561329e-02 -4.61674972e-01  2.36225434e-01  1.57820194e-01
  -1.35576278e-01  2.41356821e-01  2.03209257e-01 -1.91960802e-01
   1.56939174e-01 -2.41222817e-01]
 [ 4.51862331e-01 -2.49103387e-01  7.40350569e-02 -1.76912814e-01
  -3.97876601e-01  5.03413610e-01  6.49879382e-02  1.38112945e-02
   9.66573058e-02 -1.04552173e-01  1.14622578e-01  8.37350220e-02
  -3.73944382e-01  1.11952983e-01 -8.06328902e-02 -2.84558723e-01
   1.81451818e-02  1.57237839e-02]
 [-5.66136785e-01 -1.79851809e-01  4.34748988e-01  1.01998360e-01
  -6.87147927e-02  1.61153097e-01  1.00688056e-01 -2.15497166e-01
   6.35933915e-02 -2.49495867e-01  5.02096319e-02  4.37649907e-02
  -1.08474496e-01 -3.40878491e-01  1.56487670e-01 -2.08774083e-01
  -3.04580219e-01 -3.04186304e-02]
 [-4.84418105e-01 -1.41569001e-02 -1.67572478e-01 -2.30313563e-01
  -2.77128307e-01  1.48032250e-01  5.44574214e-02 -1.56867362e-01
   5.24978759e-03 -6.10362445e-02  2.97588112e-01  8.33669838e-02
   2.41655483e-01  3.20221887e-01  2.21054148e-02  1.01761758e-02
   5.17222779e-01  1.71506343e-01]
 [-2.60076393e-01  9.80779086e-02 -2.05031597e-01 -4.77888949e-02
   1.08075009e-01 -1.18266345e-01  1.65167200e-01 -1.51612333e-01
   1.93777917e-01  4.69059999e-01 -1.29986011e-01  1.58203940e-01
  -6.86493700e-01  1.27648385e-01  9.83643219e-02 -3.55150608e-02
   1.93956186e-02  6.41314778e-02]
 [ 4.65342885e-02  3.01323693e-03  7.06489498e-01 -1.07151583e-01
   3.85169721e-02 -2.62254132e-01 -1.70405800e-01 -5.76632611e-02
  -2.72514033e-01  1.41434233e-01  7.72596638e-02 -2.43226301e-01
  -1.58888394e-01  4.19188664e-01 -1.25447648e-02 -3.27808069e-02
   1.20597635e-01  9.19597847e-02]
 [ 1.20344026e-02 -2.13635088e-01  3.46330345e-04 -1.57049977e-01
   1.10106595e-01 -1.32935328e-01  9.55883216e-02  1.22012715e-01
   2.51281206e-01 -1.24529334e-01 -2.15011644e-01  1.75685051e-01
   1.90336498e-01  2.85710601e-01 -1.60327156e-03 -8.32589542e-02
  -3.53723696e-01  6.85618161e-01]
 [ 1.56136836e-01  1.50116709e-02 -2.37111452e-01 -3.07818692e-02
  -3.92804479e-02  3.72884301e-02  3.94638419e-02 -8.10394855e-01
  -2.71573184e-01 -7.57105808e-02 -1.53180808e-01 -3.07948154e-01
   3.76087492e-02  4.34650674e-02  9.94304634e-03  2.68915150e-02
  -1.86595152e-01  1.42380007e-01]
 [-6.00485194e-02  4.26993118e-01 -1.46240270e-01  5.21374718e-01
  -3.63120360e-01 -6.27796802e-02 -6.40502241e-02  1.86946145e-01
  -1.80912790e-01 -1.74070296e-01  2.77272123e-01 -7.85141734e-02
  -2.00683948e-01  1.46861607e-01  1.73360301e-02 -3.13689218e-02
  -2.31451048e-01  2.88502234e-01]
 [-9.67780251e-03 -5.97862837e-01 -1.57257142e-01  1.66551725e-01
  -6.36138719e-02 -8.63169844e-02 -7.98693109e-02  4.21515054e-02
  -1.44490635e-01  5.11259153e-01  4.53236855e-01 -1.26992250e-01
   1.09982525e-01 -1.11271959e-01  2.40943096e-02 -9.89651885e-03
  -1.82212045e-01  9.04014702e-02]
 [-6.50956666e-02 -2.61244802e-01  7.82651714e-02  5.60792139e-01
  -3.22276873e-01  4.87809642e-02  1.81839668e-02 -2.50330194e-02
   1.64490784e-01  1.47280090e-01 -5.64444637e-01 -6.85856929e-02
   1.47099233e-01  2.32941262e-01 -2.77589170e-02  2.78187408e-03
   1.90629960e-01 -1.20966490e-01]
 [ 6.00532537e-03 -7.38059396e-02  2.50791236e-02  3.59880417e-02
  -1.25847434e-02  2.84168792e-02  2.49652703e-01  4.21478467e-02
  -7.17396292e-01  4.70233017e-02 -1.71503771e-01  6.16589383e-01
   2.64910290e-02  1.42959461e-02 -1.74310271e-03  7.08894692e-03
  -7.67874680e-03 -6.37681817e-03]
 [-1.00728764e-02 -9.15939674e-03  6.94599696e-03 -4.20156482e-02
   3.12698087e-02 -9.99915816e-03  8.40975659e-01  2.38188639e-01
  -1.01154594e-01 -1.69481636e-02  6.04665108e-03 -4.69202757e-01
   1.17483082e-02  3.14812146e-03 -3.03156233e-03 -1.25315953e-02
   4.34282436e-02 -6.47700819e-03]]
In [935]:
print(pca.explained_variance_ratio_)
[5.21860337e-01 1.67297684e-01 1.05626388e-01 6.54745969e-02
 5.08986889e-02 2.99641300e-02 1.99136623e-02 1.23150069e-02
 8.91215289e-03 5.09714695e-03 3.69004485e-03 2.58586200e-03
 1.98624491e-03 1.52109243e-03 1.14194232e-03 9.94191854e-04
 5.56329946e-04 1.64497408e-04]
In [936]:
plt.bar(list(range(0,18)),pca.explained_variance_ratio_,alpha=0.5, align='center')
plt.ylabel('Variation explained')
plt.xlabel('eigen Value')
plt.show()
In [937]:
plt.step(list(range(0,18)),np.cumsum(pca.explained_variance_ratio_), where='mid')
plt.ylabel('Cum of variation explained')
plt.xlabel('eigen Value')
plt.show()

we can see there is a sharp elbow around x=6 so we take n_components =6

DIMENSIONALITY REDUCTION.

In [938]:
pca2 = PCA(n_components=6)
pca2.fit(z)
print(pca2.components_)
print(pca2.explained_variance_ratio_)
Xpca = pca2.transform(z)
[[ 2.75283688e-01  2.93258469e-01  3.04609128e-01  2.67606877e-01
   8.05039890e-02  9.72756855e-02  3.17092750e-01 -3.14133155e-01
   3.13959064e-01  2.82830900e-01  3.09280359e-01  3.13788457e-01
   2.72047492e-01 -2.08137692e-02  4.14555082e-02  5.82250207e-02
   3.02795063e-02  7.41453913e-02]
 [-1.26953763e-01  1.25576727e-01 -7.29516436e-02 -1.89634378e-01
  -1.22174860e-01  1.07482875e-02  4.81181371e-02  1.27498515e-02
   5.99352482e-02  1.16220532e-01  6.22806229e-02  5.37843596e-02
   2.09233172e-01  4.88525148e-01 -5.50899716e-02 -1.24085090e-01
  -5.40914775e-01 -5.40354258e-01]
 [-1.19922479e-01 -2.48205467e-02 -5.60143254e-02  2.75074210e-01
   6.42012966e-01  5.91801304e-01 -9.76283108e-02  5.76484384e-02
  -1.09512416e-01 -1.70641987e-02  5.63239801e-02 -1.08840729e-01
  -3.14636493e-02  2.86277015e-01 -1.15679354e-01 -7.52828901e-02
   8.73592034e-03  3.95242743e-02]
 [ 7.83843562e-02  1.87337408e-01 -7.12008427e-02 -4.26053415e-02
   3.27257119e-02  3.14147277e-02 -9.57485748e-02  8.22901952e-02
  -9.24582990e-02  1.88005612e-01 -1.19844008e-01 -9.17449325e-02
   2.00095228e-01 -6.55051354e-02  6.04794251e-01 -6.66114117e-01
   1.05526253e-01  4.74890311e-02]
 [ 6.95178336e-02 -8.50649539e-02  4.06645651e-02 -4.61473714e-02
  -4.05494487e-02  2.13432566e-01 -1.54853055e-02  7.68518712e-02
   2.17633164e-03 -6.06366845e-02 -4.56472348e-04 -1.95548316e-02
  -6.15991681e-02  1.45530146e-01  7.29189842e-01  5.99196401e-01
  -1.00602332e-01 -2.98614819e-02]
 [ 1.44875476e-01 -3.02731149e-01 -1.38405773e-01  2.48136636e-01
   2.36932611e-01 -4.19330746e-01  1.16100154e-01 -1.41840112e-01
   9.80561298e-02 -4.61674972e-01  2.36225433e-01  1.57820197e-01
  -1.35576278e-01  2.41356821e-01  2.03209257e-01 -1.91960802e-01
   1.56939174e-01 -2.41222817e-01]]
[0.52186034 0.16729768 0.10562639 0.0654746  0.05089869 0.02996413]
In [939]:
Xpca
Out[939]:
array([[ 3.34162030e-01, -2.19026358e-01,  1.00158417e+00,
         1.76612370e-01,  7.93007079e-02, -7.57446693e-01],
       [-1.59171085e+00, -4.20602982e-01, -3.69033854e-01,
         2.33234117e-01,  6.93948582e-01, -5.17161832e-01],
       [ 3.76932418e+00,  1.95282752e-01,  8.78587404e-02,
         1.20221219e+00,  7.31732265e-01,  7.05041038e-01],
       ...,
       [ 4.80917387e+00, -1.24931049e-03,  5.32333105e-01,
         2.95652324e-01, -1.34423635e+00, -2.17069763e-01],
       [-3.29409242e+00, -1.00827615e+00, -3.57003198e-01,
        -1.93367514e+00,  4.27680051e-02, -4.02491278e-01],
       [-4.76505347e+00,  3.34899728e-01, -5.68136078e-01,
        -1.22480708e+00, -5.40510367e-02, -3.35637136e-01]])
In [954]:
sns.pairplot(pd.DataFrame(Xpca))
Out[954]:
<seaborn.axisgrid.PairGrid at 0x19b22149c88>

REPEATING 3,4,5 STEPS USING PCA DATA.

SPLITTING PCA DATA INTO TRAIN AND TEST SET

In [941]:
X,y = np.array(Xpca)[ :, 0:6], np.array(vehicle['class'])[:]  #Note that the X variable contains PCA data
x_train, x_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=1)
In [942]:
print("{0:0.2f}% vehicle is in training set".format((len(x_train)/len(vehicle.index)) * 100))
print("{0:0.2f}% vehicle is in test set".format((len(x_test)/len(vehicle.index)) * 100))
69.98% vehicle is in training set
30.02% vehicle is in test set

IMPLEMENTING SVM ON THE PCA DATA.

In [943]:
from sklearn import svm
clf = svm.SVC ()
clf.fit(x_train , y_train)
Out[943]:
SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
    decision_function_shape='ovr', degree=3, gamma='auto_deprecated',
    kernel='rbf', max_iter=-1, probability=False, random_state=None,
    shrinking=True, tol=0.001, verbose=False)
In [944]:
def getAccuracy(testSet, predictions):
	correct = 0
	for x in range(len(testSet)):
		if testSet[x]== predictions[x]:
			correct += 1
	return (correct/float(len(testSet))) * 100.0
In [945]:
y_pred = clf.predict(x_test)
In [946]:
getAccuracy(y_test , y_pred)   
Out[946]:
86.61417322834646
In [947]:
from sklearn import metrics
import seaborn as sns

cm=metrics.confusion_matrix(y_test, y_pred)

df_cm = pd.DataFrame(cm)
                  
plt.figure(figsize = (5,5))
sns.heatmap(df_cm, annot=True ,fmt='g')
Out[947]:
<matplotlib.axes._subplots.AxesSubplot at 0x19b220ed188>

IMPLEMENTING KFOLD CROSS VALIDATION ON THE PCA DATA.

In [953]:
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score
from sklearn.tree import DecisionTreeRegressor

num_folds = 10
seed = 7


kfold = KFold(n_splits=num_folds, random_state=seed)
regression_model = DecisionTreeRegressor()
results = cross_val_score(regression_model, X, y, cv=kfold)
print(results)
print("Accuracy: %.3f%% (%.3f%%)" % (results.mean()*100.0, results.std()*100.0))
[0.77545629 0.88815789 0.84921439 0.83067729 0.70467467 0.74277068
 0.6570262  0.64853556 0.81798483 0.7178626 ]
Accuracy: 76.324% (7.813%)

As we can see the we can improve the accuracy of both SVM and Kfold has increased after performing pca.

In [ ]:
 
In [ ]: